# Clear all objects (from the workspace)
rm(list = ls())

# Suppress Warning messages
options(warn = -1)

# Turn off scientific notation like 1e+06
# options(scipen=999)

options(stringsAsFactors = F)

# Load Libs

# # INSTALL with:
# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install("DESeq2")

library(tidyr)
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum
library(limma)
## 
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
## 
##     plotMA
## The following object is masked from 'package:BiocGenerics':
## 
##     plotMA
# LOAD provided functions
source("./script_ejercicios.R")
# LOAD data/matrixes
count_mat = read.table("./count_mat.txt", header = TRUE, sep = "\t")
FPKM_mat = read.table("./FPKM_mat.txt", header = TRUE, sep = "\t")

# Let's take a LOOK
count_mat[10000:10010, ]
FPKM_mat[10000:10010, ]
# FORMAT row names
count_mat = count_mat %>% unite("rowNames", geneId:geneName, remove = TRUE)
FPKM_mat = FPKM_mat %>% unite("rowNames", geneId:geneName, remove = TRUE)
row.names(count_mat) = count_mat[, 1]
row.names(FPKM_mat) = FPKM_mat[, 1]

# DROP extra data
drops <- c("rowNames", "type", "specific_type")
count_mat = count_mat[, !(names(count_mat) %in% drops)]
FPKM_mat = FPKM_mat[, !(names(FPKM_mat) %in% drops)]

#count_mat["geneId"]
# gsub(” “, “_”,(trimws(gsub(“\\_+”, ” “, stringVec3))))
# row.names(count_mat) <- count_mat[]

# Let's take a LOOK
count_mat[10000:10010, ]
FPKM_mat[10000:10010, ]
# FILTRAR genes - conteos

# Para cada gen, contar el número de muestras con mayor a 5 conteos
count_mat_filter <-
    apply(count_mat, 1, function(x)
        length(which(x >= 5)))
table(count_mat_filter)
## count_mat_filter
##     0     1     2     3     4     5     6     7     8     9 
## 19546   882   681   727   474   447   562   640   686 11226
# Filtrar genes con menor a 2 muestras con más de 5 conteos
count_mat <- count_mat[which(count_mat_filter >= 2),]

# Let's take a LOOK
dim(count_mat)
## [1] 15443     9
count_mat[10000:10010, ]
# FILTRAR genes - FPKM

# DO log base 2
FPKM_mat <- glog(FPKM_mat)

#Para cada gen, contar el número de muestras con expresión mayor a 2
FPKM_mat_filter <-
    apply(FPKM_mat, 1, function(x)
        length(which(x >= 2)))
#Filtrar genes con menor a 2 muestras con expresión mayor a 2
FPKM_mat <- FPKM_mat[which(FPKM_mat_filter >= 2), ]

# Let's take a LOOK
dim(FPKM_mat)
## [1] 15787     9
FPKM_mat[10000:10010, ]
# Let's see what genes share count_mat and FPKM_mat
length(intersect(rownames(count_mat),rownames(FPKM_mat)))
## [1] 14068

Conteos [28 vs 0 días]

count_mat_28v0 = count_mat[,-grep('d14_', colnames(count_mat))]

# Conteos
aux_classes = rep(0, times = ncol(count_mat_28v0)) # CLASSIFY "d0_" as 0s
aux_classes[grep(pattern = "d28_", x = colnames(count_mat_28v0))] = 1
aux_classes
## [1] 0 0 0 1 1
colnames(count_mat_28v0)
## [1] "d0_r1"   "d0_r2"   "d0_r4"   "d28_r8"  "d28_r14"
count_results = DESeq_func(matrix_c = count_mat_28v0, classes_c = aux_classes)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
count_results = count_results[order(count_results$pvalue),]
count_results
# HISTOGRAM
pvals = count_results["pvalue"]
hist(
    pvals[, 1],
    prob = TRUE,
    col = "black",
    border = "white",
    xlab = "scores",
    breaks = 100
)
box(bty = "l")
# Draw density function (assuming normal dist)
score_mean = mean(pvals[, 1])
score_sd   = sd(pvals[, 1])
curve(
    dnorm(x, mean = score_mean, sd = score_sd),
    add = TRUE,
    col = "red",
    lwd = 2
)

# Let's take a look to some genes
count_mat_28v0["ENSG00000128567.16_PODXL",]
count_mat_28v0["ENSG00000185559.14_DLK1",]

Conteos [14 vs 0 días]

count_mat_14v0 = count_mat[,-grep('d28_', colnames(count_mat))]

# Conteos
aux_classes = rep(0, times = ncol(count_mat_14v0)) # CLASSIFY "d0_" as 0s
aux_classes[grep(pattern = "d14_", x = colnames(count_mat_14v0))] = 1
aux_classes
## [1] 0 0 0 1 1 1 1
colnames(count_mat_14v0)
## [1] "d0_r1"   "d0_r2"   "d0_r4"   "d14_r7"  "d14_r8"  "d14_r9"  "d14_r10"
count_results = DESeq_func(matrix_c = count_mat_14v0, classes_c = aux_classes)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
count_results = count_results[order(count_results$pvalue),]
count_results
# HISTOGRAM
pvals = count_results["pvalue"]
hist(
    pvals[, 1],
    prob = TRUE,
    col = "black",
    border = "white",
    xlab = "scores",
    breaks = 100
)
box(bty = "l")
# Draw density function (assuming normal dist)
score_mean = mean(pvals[, 1])
score_sd   = sd(pvals[, 1])
curve(
    dnorm(x, mean = score_mean, sd = score_sd),
    add = TRUE,
    col = "red",
    lwd = 2
)

# Let's take a look to some genes
count_mat_14v0["ENSG00000265992.1_ESRG",]
count_mat_14v0["ENSG00000185559.14_DLK1",]

FPKM [28 vs 0 días]

FPKM_mat_28v0 = FPKM_mat[,-grep('d14_', colnames(FPKM_mat))]

# FPKM
aux_classes = rep(0, times = ncol(FPKM_mat_28v0)) # CLASSIFY "d0_" as 0s
aux_classes[grep(pattern = "d28_", x = colnames(FPKM_mat_28v0))] = 1
aux_classes
## [1] 0 0 0 1 1
colnames(FPKM_mat_28v0)
## [1] "d0_r1"   "d0_r2"   "d0_r4"   "d28_r8"  "d28_r14"
fpkm_results =
    limma4DS_fdr(
        matrix_e = FPKM_mat_28v0,
        classes_e = aux_classes,
        classes_names = c("d0_", "d28_")
    )
## toptable() is deprecated and will be removed in the future version of limma. Please use topTable() instead.
fpkm_results = fpkm_results[order(fpkm_results$p.value),]
fpkm_results
# HISTOGRAM
pvals = fpkm_results["p.value"]
hist(
    pvals[, 1],
    prob = TRUE,
    col = "black",
    border = "white",
    xlab = "scores",
    breaks = 100
)
box(bty = "l")
# Draw density function (assuming normal dist)
score_mean = mean(pvals[, 1])
score_sd   = sd(pvals[, 1])
curve(
    dnorm(x, mean = score_mean, sd = score_sd),
    add = TRUE,
    col = "red",
    lwd = 2
)

# Let's take a look to some genes
FPKM_mat_28v0["ENSG00000261780.2_LINC02582",]
FPKM_mat_28v0["ENSG00000197406.7_DIO3",]

FPKM [14 vs 0 días]

FPKM_mat_14v0 = FPKM_mat[,-grep('d28_', colnames(FPKM_mat))]

# FPKM
aux_classes = rep(0, times = ncol(FPKM_mat_14v0)) # CLASSIFY "d0_" as 0s
aux_classes[grep(pattern = "d14_", x = colnames(FPKM_mat_14v0))] = 1
aux_classes
## [1] 0 0 0 1 1 1 1
colnames(FPKM_mat_14v0)
## [1] "d0_r1"   "d0_r2"   "d0_r4"   "d14_r7"  "d14_r8"  "d14_r9"  "d14_r10"
fpkm_results =
    limma4DS_fdr(
        matrix_e = FPKM_mat_14v0,
        classes_e = aux_classes,
        classes_names = c("d0_", "d14_")
    )
## toptable() is deprecated and will be removed in the future version of limma. Please use topTable() instead.
fpkm_results = fpkm_results[order(fpkm_results$p.value),]
fpkm_results
# HISTOGRAM
pvals = fpkm_results["p.value"]
hist(
    pvals[, 1],
    prob = TRUE,
    col = "black",
    border = "white",
    xlab = "scores",
    breaks = 100
)
box(bty = "l")
# Draw density function (assuming normal dist)
score_mean = mean(pvals[, 1])
score_sd   = sd(pvals[, 1])
curve(
    dnorm(x, mean = score_mean, sd = score_sd),
    add = TRUE,
    col = "red",
    lwd = 2
)

# Let's take a look to some genes
FPKM_mat_14v0["ENSG00000253507.5_AC104257.1",]
FPKM_mat_14v0["ENSG00000185559.14_DLK1",]
# Enable Warning messages
options(warn = 0)